[SPARK-31423][SQL] Fix rebasing of not-existed dates

### What changes were proposed in this pull request?
In this PR, I propose to change the rebasing of dates that do not exist in the hybrid calendar (Julian + Gregorian since 1582-10-15), i.e. dates in the range (1582-10-04, 1582-10-15). Non-existent dates from this range are shifted to the first valid date in the hybrid calendar, 1582-10-15. The changes affect only `rebaseGregorianToJulianDays()` because the reverse rebasing from hybrid dates to Proleptic Gregorian dates does not have this problem.

### Why are the changes needed?
Currently, non-existent dates are shifted by the standard difference between the Julian and Gregorian calendars on 1582-10-04, for example 1582-10-14 -> 1582-10-24. That contradicts how non-existent dates are shifted in other cases, for example:
```
scala> sql("select date'1990-9-31'").show
+-----------------+
|DATE '1990-10-01'|
+-----------------+
|       1990-10-01|
+-----------------+
```

### Does this PR introduce any user-facing change?
Yes, this impacts the conversion of Spark SQL `DATE` values to external dates based on a non-Proleptic Gregorian calendar. For example, when saving the date 1582-10-14 to ORC files, it will be shifted to the next valid date, 1582-10-15.

### How was this patch tested?
- Added tests to `RebaseDateTimeSuite` and to `OrcSourceSuite`
- By existing test suites `DateTimeUtilsSuite`, `DateFunctionsSuite`, `DateExpressionsSuite`, `CollectionExpressionsSuite`, `ParquetIOSuite`.

Closes #28225 from MaxGekk/fix-not-exist-dates.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Max Gekk 2020-04-15 16:33:56 +00:00 committed by Wenchen Fan
parent 7699f765f5
commit 2b10d70bad
3 changed files with 39 additions and 7 deletions

View file

@ -131,7 +131,8 @@ object RebaseDateTime {
// The differences in days between Proleptic Gregorian and Julian dates.
// The diff at the index `i` is applicable for all days in the date interval:
// [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1))
private val gregJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0)
private val gregJulianDiffs = Array(
-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
// The sorted days in Proleptic Gregorian calendar when difference in days between
// Proleptic Gregorian and Julian was changed.
// The starting point is the `0001-01-01` (-719162 days since the epoch in
@ -139,13 +140,17 @@ object RebaseDateTime {
// Rebasing switch days and diffs `gregJulianDiffSwitchDay` and `gregJulianDiffs`
// was generated by the `localRebaseGregorianToJulianDays` function.
private val gregJulianDiffSwitchDay = Array(
-719162, -682944, -646420, -609896, -536847, -500323, -463799,
-390750, -354226, -317702, -244653, -208129, -171605, -141427)
-719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750,
-354226, -317702, -244653, -208129, -171605, -141436, -141435, -141434,
-141433, -141432, -141431, -141430, -141429, -141428, -141427)
// The first day of the Common Era (CE), which is mapped to the '0001-01-01' date
// in Proleptic Gregorian calendar.
private final val gregorianCommonEraStartDay = gregJulianDiffSwitchDay(0)
private final val gregorianStartDay = LocalDate.of(1582, 10, 15)
private final val julianEndDay = LocalDate.of(1582, 10, 4)
/**
* Converts the given number of days since the epoch day 1970-01-01 to a local date in Proleptic
* Gregorian calendar, interprets the result as a local date in Julian calendar, and takes the
@ -165,7 +170,10 @@ object RebaseDateTime {
* @return The rebased number of days in Julian calendar.
*/
private[sql] def localRebaseGregorianToJulianDays(days: Int): Int = {
val localDate = LocalDate.ofEpochDay(days)
var localDate = LocalDate.ofEpochDay(days)
if (localDate.isAfter(julianEndDay) && localDate.isBefore(gregorianStartDay)) {
localDate = gregorianStartDay
}
val utcCal = new Calendar.Builder()
// `gregory` is a hybrid calendar that supports both
// the Julian and Gregorian calendar systems

View file

@ -364,4 +364,26 @@ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper {
}
}
}
// Checks that Proleptic Gregorian dates falling into the cutover gap
// (1582-10-04, 1582-10-15) are rebased to 1582-10-15, while the two boundary
// dates are rebased to themselves.
test("rebase not-existed dates in the hybrid calendar") {
  // Generate the gap days from a range so that no day can be left out by accident
  // (the hand-written table used to skip 1582-10-10).
  val gapDates = (5 to 14).map(day => f"1582-10-$day%02d" -> "1582-10-15")
  // Each pair is: Proleptic Gregorian input date -> expected date in the hybrid calendar.
  val gregToHybrid =
    (Seq("1582-10-04" -> "1582-10-04") ++ gapDates) :+ ("1582-10-15" -> "1582-10-15")
  outstandingZoneIds.foreach { zid =>
    withDefaultTimeZone(zid) {
      gregToHybrid.foreach { case (gregDate, hybridDate) =>
        withClue(s"tz = ${zid.getId} greg date = $gregDate hybrid date = $hybridDate") {
          // Expected result: days since the epoch of the date taken in the hybrid calendar.
          val hybridDays = fromJavaDateLegacy(Date.valueOf(hybridDate))
          // Input: days since the epoch of the date taken in Proleptic Gregorian calendar.
          val gregorianDays = localDateToDays(LocalDate.parse(gregDate))
          assert(localRebaseGregorianToJulianDays(gregorianDays) === hybridDays)
          assert(rebaseGregorianToJulianDays(gregorianDays) === hybridDays)
        }
      }
    }
  }
}
}

View file

@ -493,17 +493,19 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll {
}
}
test("SPARK-31238: rebasing dates in write") {
test("SPARK-31238, SPARK-31423: rebasing dates in write") {
withTempPath { dir =>
val path = dir.getAbsolutePath
Seq("1001-01-01").toDF("dateS")
Seq("1001-01-01", "1582-10-10").toDF("dateS")
.select($"dateS".cast("date").as("date"))
.write
.orc(path)
Seq(false, true).foreach { vectorized =>
withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
checkAnswer(spark.read.orc(path), Row(Date.valueOf("1001-01-01")))
checkAnswer(
spark.read.orc(path),
Seq(Row(Date.valueOf("1001-01-01")), Row(Date.valueOf("1582-10-15"))))
}
}
}