[SPARK-31892][SQL] Disable week-based date fields for parsing

### What changes were proposed in this pull request?

This PR disables the week-based date fields for parsing.

closes #28674

### Why are the changes needed?

1. It's an unfixable behavior change needed to fill the gap between SimpleDateFormat and DateTimeFormatter and to keep backward compatibility across different JDKs. A lot of effort has been made to prove this at https://github.com/apache/spark/pull/28674

2. The existing behavior itself in 2.4 is confusing, e.g.

```sql
spark-sql> select to_timestamp('1', 'w');
1969-12-28 00:00:00
spark-sql> select to_timestamp('1', 'u');
1970-01-05 00:00:00
```
  The 'u' here seems to go neither to the Monday of the first week in the week-based form, nor to the first day of the year in the non-week-based form, but to the Monday of the second week in the week-based form.
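To see where this comes from, here is a minimal sketch of the legacy `SimpleDateFormat` path that 2.4 relies on (our own illustration, not part of the patch; `Locale.US` and UTC assumed, and the resolution rules are `GregorianCalendar`'s, not Spark's):

```scala
import java.text.SimpleDateFormat
import java.util.{Locale, TimeZone}

val sdf = new SimpleDateFormat("u", Locale.US)
sdf.setTimeZone(TimeZone.getTimeZone("UTC"))
// Parsing "1" only sets DAY_OF_WEEK (1 = Monday). GregorianCalendar then
// resolves it against the default month, January 1970, effectively picking
// the first Monday of that month rather than the Monday of week 1.
println(sdf.parse("1")) // Mon Jan 05 00:00:00 UTC 1970, matching the 2.4 output above
```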

And, e.g.
```sql
spark-sql> select to_timestamp('2020 2020', 'YYYY yyyy');
2020-01-01 00:00:00
spark-sql> select to_timestamp('2020 2020', 'yyyy YYYY');
2019-12-29 00:00:00
spark-sql> select to_timestamp('2020 2020 1', 'YYYY yyyy w');
NULL
spark-sql> select to_timestamp('2020 2020 1', 'yyyy YYYY w');
2019-12-29 00:00:00
```

  I think we don't need to introduce all of this weird behavior from Java.
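The `YYYY`/`yyyy` divergence above can be reproduced directly with `java.time` (again our own sketch, not from the patch; week fields are locale-dependent, `Locale.US` assumed):

```scala
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Locale

// 'y' is the calendar year, 'Y' the week-based year. Near a year boundary
// they disagree: 2019-12-29 is a Sunday starting a week that contains
// Jan 1, so its week-based year is already 2020 under US week rules.
val d = LocalDate.of(2019, 12, 29)
println(DateTimeFormatter.ofPattern("yyyy", Locale.US).format(d)) // 2019
println(DateTimeFormatter.ofPattern("YYYY", Locale.US).format(d)) // 2020
```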

3. The current test coverage for the week-based date fields is almost 0%, which indicates that we never really expected them to be used.

4. It avoids JDK bugs, e.g. https://issues.apache.org/jira/browse/SPARK-31880

### Does this PR introduce _any_ user-facing change?

Yes, the 'Y'/'W'/'w'/'u'/'F'/'E' pattern letters can no longer be used in datetime parsing functions.
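For example (a sketch of the expected post-patch behavior in `spark-shell`, under the default `spark.sql.legacy.timeParserPolicy`):

```scala
// Formatting with week-based letters is still allowed...
spark.sql("SELECT date_format(TIMESTAMP '2020-06-03 00:00:00', 'YYYY-ww')").show()
// ...but parsing with them is expected to fail with a SparkUpgradeException
// pointing at the 3.0 migration (or set spark.sql.legacy.timeParserPolicy=LEGACY).
spark.sql("SELECT to_timestamp('2020-23', 'YYYY-ww')").show()
```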

### How was this patch tested?

More tests added.

Closes #28706 from yaooqinn/SPARK-31892.

Authored-by: Kent Yao <yaooqinn@hotmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>

@@ -136,6 +136,8 @@ The count of pattern letters determines the format.
During formatting, all valid data will be output even if it is in the optional section.
During parsing, the whole section may be missing from the parsed string.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).
- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.
More details for the text style:

@@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
TimestampFormatter(
format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
zoneId = ZoneOffset.UTC,
needVarLengthSecondFraction = isParsing)
isParsing = isParsing)
}
/**

@@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private val decimalParser = if (options.locale == Locale.US) {
// Special handling the default locale for backward compatibility

@@ -47,12 +47,13 @@ class UnivocityGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)
private def makeConverter(dataType: DataType): ValueConverter = dataType match {
case DateType =>

@@ -90,12 +90,13 @@ class UnivocityParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)
private val csvFilters = new CSVFilters(filters, requiredSchema)

@@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
}
} else None
}
@@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} else {
formatter.get
}
@@ -890,7 +890,7 @@ abstract class ToTimestamp
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
@@ -929,7 +929,7 @@ abstract class ToTimestamp
formatString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
} catch {
case e: SparkUpgradeException => throw e
@@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
@@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
f.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
} catch {
case e: SparkUpgradeException => throw e

@@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)
private def makeWriter(dataType: DataType): ValueWriter = dataType match {
case NullType =>

@@ -61,12 +61,13 @@ class JacksonParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)
/**
* Create a converter which converts the JSON documents held by the `JsonParser`

@@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
/**
* Infer the type of a collection of json records in three stages:

@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util
import java.text.SimpleDateFormat
import java.time.{LocalDate, ZoneId}
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import org.apache.commons.lang3.time.FastDateFormat
@@ -42,7 +41,8 @@ class Iso8601DateFormatter(
pattern: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormats.LegacyDateFormat)
legacyFormat: LegacyDateFormats.LegacyDateFormat,
isParsing: Boolean)
extends DateFormatter with DateTimeFormatterHelper {
@transient
@@ -131,12 +131,13 @@ object DateFormatter {
format: Option[String],
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = true): DateFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
df.validatePatternString()
df
}
@@ -159,8 +160,9 @@ object DateFormatter {
format: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat)
legacyFormat: LegacyDateFormat,
isParsing: Boolean): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}
def apply(format: String, zoneId: ZoneId): DateFormatter = {

@@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
protected def getOrCreateFormatter(
pattern: String,
locale: Locale,
needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern)
val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
isParsing: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern, isParsing)
val useVarLen = isParsing && newPattern.contains('S')
val key = (newPattern, locale, useVarLen)
var formatter = cache.getIfPresent(key)
if (formatter == null) {
@@ -227,6 +227,12 @@ private object DateTimeFormatterHelper {
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
}
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
// SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
// to datetime, especially when they are mixed with other non-week-based ones.
// The quarter fields are also parsed strangely, e.g. when the pattern contains `yMd` and can
// be directly resolved, the `q` does check whether the month is valid, but if the date
// fields are incomplete, e.g. `yM`, the check is bypassed.
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
final val unsupportedPatternLengths = {
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
// 2.4
@@ -246,7 +252,7 @@
* @param pattern The input pattern.
* @return The pattern for new parser
*/
def convertIncompatiblePattern(pattern: String): String = {
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
case (patternPart, index) =>
// Text can be quoted using single quotes, we only check the non-quote parts.
@@ -255,7 +261,8 @@
(pattern + " ").split("'").zipWithIndex.map {
case (patternPart, index) =>
if (index % 2 == 0) {
for (c <- patternPart if unsupportedLetters.contains(c)) {
for (c <- patternPart if unsupportedLetters.contains(c) ||
(isParsing && unsupportedLettersForParsing.contains(c))) {
throw new IllegalArgumentException(s"Illegal pattern character: $c")
}
for (style <- unsupportedPatternLengths if patternPart.contains(style)) {

@@ -293,13 +293,13 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
isParsing: Boolean = false): TimestampFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val tf = new Iso8601TimestampFormatter(
pattern, zoneId, locale, legacyFormat, needVarLengthSecondFraction)
pattern, zoneId, locale, legacyFormat, isParsing)
tf.validatePatternString()
tf
}
@@ -325,23 +325,23 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}
def apply(
format: String,
zoneId: ZoneId,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, isParsing)
}
def apply(
format: String,
zoneId: ZoneId,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, needVarLengthSecondFraction = needVarLengthSecondFraction)
isParsing: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}
def apply(zoneId: ZoneId): TimestampFormatter = {

@@ -1168,4 +1168,33 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkExceptionInExpression[ArithmeticException](
MillisToTimestamp(Literal(-92233720368547758L)), "long overflow")
}
test("Disable week-based date fields and quarter fields for parsing") {
def checkSparkUpgrade(c: Char): Unit = {
checkExceptionInExpression[SparkUpgradeException](
new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
new ParseToDate(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
ToUnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
checkExceptionInExpression[SparkUpgradeException](
UnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
}
def checkNullify(c: Char): Unit = {
checkEvaluation(new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(new ParseToDate(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(ToUnixTimestamp(Literal("1"), Literal(c.toString)), null)
checkEvaluation(UnixTimestamp(Literal("1"), Literal(c.toString)), null)
}
Seq('Y', 'W', 'w', 'E', 'u', 'F').foreach { l =>
checkSparkUpgrade(l)
}
Seq('q', 'Q').foreach { l =>
checkNullify(l)
}
}
}

@@ -17,7 +17,7 @@
package org.apache.spark.sql.catalyst.util
import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._
class DateTimeFormatterHelperSuite extends SparkFunSuite {
@@ -40,6 +40,13 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
assert(e.getMessage === s"Illegal pattern character: $l")
}
unsupportedLettersForParsing.foreach { l =>
val e = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"$l", isParsing = true)
}
assert(e.getMessage === s"Illegal pattern character: $l")
assert(convertIncompatiblePattern(s"$l").nonEmpty)
}
unsupportedPatternLengths.foreach { style =>
val e1 = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"yyyy-MM-dd $style")

@@ -72,7 +72,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val days = formatter.parse(date)
assert(date === formatter.format(days))
assert(date === formatter.format(daysToLocalDate(days)))
@@ -106,7 +107,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val date = formatter.format(days)
val parsed = formatter.parse(date)
assert(days === parsed)
@@ -173,7 +175,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
assert(formatter.format(LocalDate.of(1000, 1, 1)) === "1000-01-01")
assert(formatter.format(localDateToDays(LocalDate.of(1000, 1, 1))) === "1000-01-01")

@@ -48,7 +48,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
val formatter = TimestampFormatter(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
getZoneId(zoneId),
needVarLengthSecondFraction = true)
isParsing = true)
val microsSinceEpoch = formatter.parse(localDate)
assert(microsSinceEpoch === expectedMicros(zoneId))
}
@@ -73,7 +73,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
// Test only FAST_DATE_FORMAT because other legacy formats don't support formatting
// in microsecond precision.
LegacyDateFormats.FAST_DATE_FORMAT,
needVarLengthSecondFraction = false),
isParsing = false),
TimestampFormatter.getFractionFormatter(getZoneId(zoneId))).foreach { formatter =>
val timestamp = formatter.format(microsSinceEpoch)
assert(timestamp === expectedTimestamp(zoneId))
@@ -98,7 +98,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
outstandingZoneIds.foreach { zoneId =>
val timestamp = TimestampFormatter(pattern, zoneId).format(micros)
val parsed = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
pattern, zoneId, isParsing = true).parse(timestamp)
assert(micros === parsed)
}
}
@@ -119,7 +119,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
outstandingZoneIds.foreach { zoneId =>
val pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"
val micros = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
pattern, zoneId, isParsing = true).parse(timestamp)
val formatted = TimestampFormatter(pattern, zoneId).format(micros)
assert(timestamp === formatted)
}
@@ -185,7 +185,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
test("parsing timestamp strings with various seconds fractions") {
outstandingZoneIds.foreach { zoneId =>
def check(pattern: String, input: String, reference: String): Unit = {
val formatter = TimestampFormatter(pattern, zoneId, needVarLengthSecondFraction = true)
val formatter = TimestampFormatter(pattern, zoneId, isParsing = true)
val expected = stringToTimestamp(UTF8String.fromString(reference), zoneId).get
val actual = formatter.parse(input)
assert(actual === expected)
@@ -292,7 +292,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
zoneId,
TimestampFormatter.defaultLocale,
legacyFormat,
needVarLengthSecondFraction = false)
isParsing = false)
}.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId)
formatters.foreach { formatter =>
assert(microsToInstant(formatter.parse("1000-01-01 01:02:03"))

@@ -134,7 +134,7 @@ object PartitioningUtils {
val timestampFormatter = TimestampFormatter(
timestampPartitionPattern,
zoneId,
needVarLengthSecondFraction = true)
isParsing = true)
// First, we need to parse every partition's path and see if we can find partition values.
val (partitionValues, optDiscoveredBasePaths) = paths.map { path =>
parsePartition(path, typeInference, basePaths, userSpecifiedDataTypes,

@@ -60,7 +60,7 @@ abstract class ParquetPartitionDiscoverySuite
val timeZoneId = ZoneId.systemDefault()
val df = DateFormatter(timeZoneId)
val tf = TimestampFormatter(
timestampPartitionPattern, timeZoneId, needVarLengthSecondFraction = true)
timestampPartitionPattern, timeZoneId, isParsing = true)
protected override def beforeAll(): Unit = {
super.beforeAll()