[SPARK-31892][SQL] Disable week-based date fields for parsing

### What changes were proposed in this pull request?

This PR disables week-based date fields for parsing.

Closes #28674.

### Why are the changes needed?

1. It is an unfixable behavior change: there is no way to fill the gap between SimpleDateFormat and DateTimeFormatter while preserving backward compatibility across different JDKs. A lot of effort has been made to prove this at https://github.com/apache/spark/pull/28674.
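  For instance, the same pattern letter can mean different things in the two APIs. A minimal sketch of the 'u' divergence (plain JDK calls, not Spark code; the SimpleDateFormat result depends on the JDK version and default locale):

```scala
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.format.DateTimeFormatter

// SimpleDateFormat (the 2.4 legacy path): 'u' is the day number of the
// week, 1 = Monday ... 7 = Sunday.
val legacy = new SimpleDateFormat("u")
println(legacy.parse("1")) // resolves to some Monday; JDK/locale dependent

// DateTimeFormatter (the 3.0 path): 'u' is the (proleptic) year.
val modern = DateTimeFormatter.ofPattern("uuuu-MM-dd")
println(LocalDate.parse("1996-01-01", modern)) // 1996-01-01
```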

2. The existing behavior itself in 2.4 is confusing, e.g.

```sql
spark-sql> select to_timestamp('1', 'w');
1969-12-28 00:00:00
spark-sql> select to_timestamp('1', 'u');
1970-01-05 00:00:00
```
  The 'u' here seems to resolve neither to the Monday of the first week (in week-based form) nor to the first day of the year (in non-week-based form), but to the Monday of the second week in week-based form.

Another example:
```sql
spark-sql> select to_timestamp('2020 2020', 'YYYY yyyy');
2020-01-01 00:00:00
spark-sql> select to_timestamp('2020 2020', 'yyyy YYYY');
2019-12-29 00:00:00
spark-sql> select to_timestamp('2020 2020 1', 'YYYY yyyy w');
NULL
spark-sql> select to_timestamp('2020 2020 1', 'yyyy YYYY w');
2019-12-29 00:00:00
```

  I don't think we need to introduce all of this weird behavior from Java.
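  For reference, the `2019-12-29` above is not random: it is the first day of week-based year 2020 under the `en-US` week definition. A minimal sketch of why 'Y' (week-based year) and 'y' (calendar year) diverge at year boundaries (plain java.time; Locale.US assumed):

```scala
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import java.util.Locale

val d = LocalDate.of(2019, 12, 29) // a Sunday
// Under the US week definition this Sunday starts the week that contains
// Jan 1, 2020, so it already belongs to week-based year 2020.
println(DateTimeFormatter.ofPattern("YYYY", Locale.US).format(d)) // 2020
println(DateTimeFormatter.ofPattern("yyyy", Locale.US).format(d)) // 2019
```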

3. The current test coverage for week-based date fields is almost 0%, which indicates that we never really expected them to be used.

4. It avoids JDK bugs such as https://issues.apache.org/jira/browse/SPARK-31880.

### Does this PR introduce _any_ user-facing change?

Yes. The 'Y', 'W', 'w', 'u', 'F' and 'E' pattern letters can no longer be used in datetime parsing functions such as `to_timestamp`.
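An illustrative spark-sql session (behavior is gated by `spark.sql.legacy.timeParserPolicy`, which defaults to `EXCEPTION`; formatting functions such as `date_format` are unaffected):

```sql
spark-sql> select to_timestamp('1', 'w');
-- now fails with a SparkUpgradeException instead of returning 1969-12-28 00:00:00
```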

### How was this patch tested?

More tests were added.

Closes #28706 from yaooqinn/SPARK-31892.

Authored-by: Kent Yao <yaooqinn@hotmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>

@@ -137,6 +137,8 @@ The count of pattern letters determines the format.
During parsing, the whole section may be missing from the parsed string.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).
- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed for datetime parsing, e.g. `to_timestamp`.
More details for the text style:
- Short Form: Short text, typically an abbreviation. For example, day-of-week Monday might output "Mon".
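For example, formatting still accepts these letters; an illustrative session (day-of-week short form; 2020-06-03 is a Wednesday):

```sql
spark-sql> select date_format(timestamp '2020-06-03 00:00:00', 'E');
Wed
```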

@@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
TimestampFormatter(
format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
zoneId = ZoneOffset.UTC,
needVarLengthSecondFraction = isParsing)
isParsing = isParsing)
}
/**

@@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private val decimalParser = if (options.locale == Locale.US) {
// Special handling the default locale for backward compatibility

@@ -47,12 +47,13 @@ class UnivocityGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)
private def makeConverter(dataType: DataType): ValueConverter = dataType match {
case DateType =>

@@ -90,12 +90,13 @@ class UnivocityParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)
private val csvFilters = new CSVFilters(filters, requiredSchema)

@@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
}
} else None
}
@@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} else {
formatter.get
}
@@ -890,7 +890,7 @@ abstract class ToTimestamp
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
@@ -929,7 +929,7 @@ abstract class ToTimestamp
formatString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
} catch {
case e: SparkUpgradeException => throw e
@@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
@@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
f.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
} catch {
case e: SparkUpgradeException => throw e

@@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)
private def makeWriter(dataType: DataType): ValueWriter = dataType match {
case NullType =>

@@ -61,12 +61,13 @@ class JacksonParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)
/**
* Create a converter which converts the JSON documents held by the `JsonParser`

@@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
/**
* Infer the type of a collection of json records in three stages:

@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util
import java.text.SimpleDateFormat
import java.time.{LocalDate, ZoneId}
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import org.apache.commons.lang3.time.FastDateFormat
@@ -42,7 +41,8 @@ class Iso8601DateFormatter(
pattern: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormats.LegacyDateFormat)
legacyFormat: LegacyDateFormats.LegacyDateFormat,
isParsing: Boolean)
extends DateFormatter with DateTimeFormatterHelper {
@transient
@@ -131,12 +131,13 @@ object DateFormatter {
format: Option[String],
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = true): DateFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
df.validatePatternString()
df
}
@@ -159,8 +160,9 @@
format: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat)
legacyFormat: LegacyDateFormat,
isParsing: Boolean): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}
def apply(format: String, zoneId: ZoneId): DateFormatter = {

@@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
protected def getOrCreateFormatter(
pattern: String,
locale: Locale,
needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern)
val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
isParsing: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern, isParsing)
val useVarLen = isParsing && newPattern.contains('S')
val key = (newPattern, locale, useVarLen)
var formatter = cache.getIfPresent(key)
if (formatter == null) {
@@ -227,6 +227,12 @@ private object DateTimeFormatterHelper {
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
}
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
// SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
// to datetime, especially when they are mixed with other non-week-based ones.
// The quarter fields are also parsed strangely: when the pattern contains `yMd` and the date can
// be resolved directly, `q` does check that the month belongs to the given quarter, but if the
// date fields are incomplete, e.g. `yM`, the check is bypassed.
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
final val unsupportedPatternLengths = {
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
// 2.4
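To illustrate the quarter-field oddity described in the comment above, a minimal plain-java.time sketch (not Spark code):

```scala
import java.time.format.DateTimeFormatter

// With a fully resolvable date (y-M-d), the parsed quarter is cross-checked
// against the resolved date, and the mismatch below throws a DateTimeException:
// DateTimeFormatter.ofPattern("yyyy-MM-dd q").parse("2020-06-03 1")

// With incomplete date fields (y-M), no date is resolved, so the bogus
// quarter value is accepted silently:
DateTimeFormatter.ofPattern("yyyy-MM q").parse("2020-06 1") // succeeds
```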
@@ -246,7 +252,7 @@
* @param pattern The input pattern.
* @return The pattern for new parser
*/
def convertIncompatiblePattern(pattern: String): String = {
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
case (patternPart, index) =>
// Text can be quoted using single quotes, we only check the non-quote parts.
@@ -255,7 +261,8 @@
(pattern + " ").split("'").zipWithIndex.map {
case (patternPart, index) =>
if (index % 2 == 0) {
for (c <- patternPart if unsupportedLetters.contains(c)) {
for (c <- patternPart if unsupportedLetters.contains(c) ||
(isParsing && unsupportedLettersForParsing.contains(c))) {
throw new IllegalArgumentException(s"Illegal pattern character: $c")
}
for (style <- unsupportedPatternLengths if patternPart.contains(style)) {

@@ -293,13 +293,13 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
isParsing: Boolean = false): TimestampFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val tf = new Iso8601TimestampFormatter(
pattern, zoneId, locale, legacyFormat, needVarLengthSecondFraction)
pattern, zoneId, locale, legacyFormat, isParsing)
tf.validatePatternString()
tf
}
@@ -325,23 +325,23 @@
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}
def apply(
format: String,
zoneId: ZoneId,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, isParsing)
}
def apply(
format: String,
zoneId: ZoneId,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, needVarLengthSecondFraction = needVarLengthSecondFraction)
isParsing: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}
def apply(zoneId: ZoneId): TimestampFormatter = {

@@ -1168,4 +1168,33 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkExceptionInExpression[ArithmeticException](
MillisToTimestamp(Literal(-92233720368547758L)), "long overflow")
}
test("Disable week-based date fields and quarter fields for parsing") {
def checkSparkUpgrade(c: Char): Unit = {
checkExceptionInExpression[SparkUpgradeException](
new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
new ParseToDate(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
ToUnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
checkExceptionInExpression[SparkUpgradeException](
UnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
}
def checkNullify(c: Char): Unit = {
checkEvaluation(new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(new ParseToDate(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(ToUnixTimestamp(Literal("1"), Literal(c.toString)), null)
checkEvaluation(UnixTimestamp(Literal("1"), Literal(c.toString)), null)
}
Seq('Y', 'W', 'w', 'E', 'u', 'F').foreach { l =>
checkSparkUpgrade(l)
}
Seq('q', 'Q').foreach { l =>
checkNullify(l)
}
}
}

@@ -17,7 +17,7 @@
package org.apache.spark.sql.catalyst.util
import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._
class DateTimeFormatterHelperSuite extends SparkFunSuite {
@@ -40,6 +40,13 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
assert(e.getMessage === s"Illegal pattern character: $l")
}
unsupportedLettersForParsing.foreach { l =>
val e = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"$l", isParsing = true)
}
assert(e.getMessage === s"Illegal pattern character: $l")
assert(convertIncompatiblePattern(s"$l").nonEmpty)
}
unsupportedPatternLengths.foreach { style =>
val e1 = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"yyyy-MM-dd $style")

@@ -72,7 +72,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val days = formatter.parse(date)
assert(date === formatter.format(days))
assert(date === formatter.format(daysToLocalDate(days)))
@@ -106,7 +107,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val date = formatter.format(days)
val parsed = formatter.parse(date)
assert(days === parsed)
@@ -173,7 +175,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
assert(formatter.format(LocalDate.of(1000, 1, 1)) === "1000-01-01")
assert(formatter.format(localDateToDays(LocalDate.of(1000, 1, 1))) === "1000-01-01")

@@ -48,7 +48,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
val formatter = TimestampFormatter(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
getZoneId(zoneId),
needVarLengthSecondFraction = true)
isParsing = true)
val microsSinceEpoch = formatter.parse(localDate)
assert(microsSinceEpoch === expectedMicros(zoneId))
}
@@ -73,7 +73,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
// Test only FAST_DATE_FORMAT because other legacy formats don't support formatting
// in microsecond precision.
LegacyDateFormats.FAST_DATE_FORMAT,
needVarLengthSecondFraction = false),
isParsing = false),
TimestampFormatter.getFractionFormatter(getZoneId(zoneId))).foreach { formatter =>
val timestamp = formatter.format(microsSinceEpoch)
assert(timestamp === expectedTimestamp(zoneId))
@@ -98,7 +98,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
outstandingZoneIds.foreach { zoneId =>
val timestamp = TimestampFormatter(pattern, zoneId).format(micros)
val parsed = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
pattern, zoneId, isParsing = true).parse(timestamp)
assert(micros === parsed)
}
}
@@ -119,7 +119,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
outstandingZoneIds.foreach { zoneId =>
val pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"
val micros = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
pattern, zoneId, isParsing = true).parse(timestamp)
val formatted = TimestampFormatter(pattern, zoneId).format(micros)
assert(timestamp === formatted)
}
@@ -185,7 +185,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
test("parsing timestamp strings with various seconds fractions") {
outstandingZoneIds.foreach { zoneId =>
def check(pattern: String, input: String, reference: String): Unit = {
val formatter = TimestampFormatter(pattern, zoneId, needVarLengthSecondFraction = true)
val formatter = TimestampFormatter(pattern, zoneId, isParsing = true)
val expected = stringToTimestamp(UTF8String.fromString(reference), zoneId).get
val actual = formatter.parse(input)
assert(actual === expected)
@@ -292,7 +292,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
zoneId,
TimestampFormatter.defaultLocale,
legacyFormat,
needVarLengthSecondFraction = false)
isParsing = false)
}.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId)
formatters.foreach { formatter =>
assert(microsToInstant(formatter.parse("1000-01-01 01:02:03"))

@@ -134,7 +134,7 @@ object PartitioningUtils {
val timestampFormatter = TimestampFormatter(
timestampPartitionPattern,
zoneId,
needVarLengthSecondFraction = true)
isParsing = true)
// First, we need to parse every partition's path and see if we can find partition values.
val (partitionValues, optDiscoveredBasePaths) = paths.map { path =>
parsePartition(path, typeInference, basePaths, userSpecifiedDataTypes,

@@ -60,7 +60,7 @@ abstract class ParquetPartitionDiscoverySuite
val timeZoneId = ZoneId.systemDefault()
val df = DateFormatter(timeZoneId)
val tf = TimestampFormatter(
timestampPartitionPattern, timeZoneId, needVarLengthSecondFraction = true)
timestampPartitionPattern, timeZoneId, isParsing = true)
protected override def beforeAll(): Unit = {
super.beforeAll()