[SPARK-29679][SQL] Make interval type comparable and orderable

### What changes were proposed in this pull request?

The interval type now supports the comparison operators `>`, `>=`, `<`, `<=`, `=`, and `<=>`, as well as `ORDER BY`, `min`, `max`, etc.

### Why are the changes needed?

Part of SPARK-27764 Feature Parity between PostgreSQL and Spark
### Does this PR introduce any user-facing change?

Yes, intervals can now be compared and ordered.

### How was this patch tested?

Added unit tests.

Closes #26337 from yaooqinn/SPARK-29679.

Authored-by: Kent Yao <yaooqinn@hotmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
Kent Yao 2019-11-08 22:45:11 +08:00 committed by Wenchen Fan
parent e7f7990bc3
commit e026412d9c
8 changed files with 258 additions and 1 deletions

View file

@ -29,7 +29,7 @@ import static org.apache.spark.sql.catalyst.util.DateTimeConstants.*;
/**
* The internal representation of interval type.
*/
public final class CalendarInterval implements Serializable {
public final class CalendarInterval implements Serializable, Comparable<CalendarInterval> {
public final int months;
public final int days;
public final long microseconds;
@ -55,6 +55,29 @@ public final class CalendarInterval implements Serializable {
return Objects.hash(months, days, microseconds);
}
/**
 * Compares this interval with {@code that} by normalized length, counting one month as
 * {@code DAYS_PER_MONTH} days and one day as {@code MICROS_PER_DAY} microseconds.
 *
 * Each side is reduced to a whole-day count plus a sub-day remainder, and the two pairs are
 * compared lexicographically. The split uses floor division so the remainder is always in
 * {@code [0, MICROS_PER_DAY)}. With truncating {@code /} and {@code %} the remainder takes the
 * sign of {@code microseconds}, which mis-orders intervals whose components have mixed signs
 * (for example, 1 day minus almost a day of microseconds would compare greater than
 * 2 microseconds).
 *
 * @param that the interval to compare against
 * @return -1, 0 or 1 when this interval is respectively shorter than, equal to, or longer
 *         than {@code that} under the normalization described above
 */
@Override
public int compareTo(CalendarInterval that) {
  // Widen months to long before multiplying so the product cannot wrap in int arithmetic.
  long thisAdjustDays = Math.floorDiv(this.microseconds, MICROS_PER_DAY)
    + this.days + (long) this.months * DAYS_PER_MONTH;
  long thatAdjustDays = Math.floorDiv(that.microseconds, MICROS_PER_DAY)
    + that.days + (long) that.months * DAYS_PER_MONTH;
  if (thisAdjustDays != thatAdjustDays) {
    // Compare directly rather than subtracting, so a large difference cannot overflow.
    return thisAdjustDays > thatAdjustDays ? 1 : -1;
  }

  // Same whole-day count: the non-negative sub-day remainders decide the order.
  long thisRemainder = Math.floorMod(this.microseconds, MICROS_PER_DAY);
  long thatRemainder = Math.floorMod(that.microseconds, MICROS_PER_DAY);
  if (thisRemainder == thatRemainder) {
    return 0;
  }
  return thisRemainder > thatRemainder ? 1 : -1;
}
@Override
public String toString() {
if (months == 0 && days == 0 && microseconds == 0) {

View file

@ -855,6 +855,11 @@ object TypeCoercion {
case Divide(l @ CalendarIntervalType(), r @ NumericType()) =>
DivideInterval(l, r)
case b @ BinaryOperator(l @ CalendarIntervalType(), r @ NullType()) =>
b.withNewChildren(Seq(l, Cast(r, CalendarIntervalType)))
case b @ BinaryOperator(l @ NullType(), r @ CalendarIntervalType()) =>
b.withNewChildren(Seq(Cast(l, CalendarIntervalType), r))
case Add(l @ DateType(), r @ IntegerType()) => DateAdd(l, r)
case Add(l @ IntegerType(), r @ DateType()) => DateAdd(r, l)
case Subtract(l @ DateType(), r @ IntegerType()) => DateSub(l, r)

View file

@ -629,6 +629,7 @@ class CodegenContext extends Logging {
// use c1 - c2 may overflow
case dt: DataType if isPrimitiveType(dt) => s"($c1 > $c2 ? 1 : $c1 < $c2 ? -1 : 0)"
case BinaryType => s"org.apache.spark.sql.catalyst.util.TypeUtils.compareBinary($c1, $c2)"
case CalendarIntervalType => s"$c1.compareTo($c2)"
case NullType => "0"
case array: ArrayType =>
val elementType = array.elementType

View file

@ -91,6 +91,7 @@ object RowOrdering {
def isOrderable(dataType: DataType): Boolean = dataType match {
case NullType => true
case dt: AtomicType => true
case CalendarIntervalType => true
case struct: StructType => struct.fields.forall(f => isOrderable(f.dataType))
case array: ArrayType => isOrderable(array.elementType)
case udt: UserDefinedType[_] => isOrderable(udt.sqlType)

View file

@ -71,6 +71,7 @@ object TypeUtils {
def getInterpretedOrdering(t: DataType): Ordering[Any] = {
t match {
case i: AtomicType => i.ordering.asInstanceOf[Ordering[Any]]
case c: CalendarIntervalType => c.ordering.asInstanceOf[Ordering[Any]]
case a: ArrayType => a.interpretedOrdering.asInstanceOf[Ordering[Any]]
case s: StructType => s.interpretedOrdering.asInstanceOf[Ordering[Any]]
case udt: UserDefinedType[_] => getInterpretedOrdering(udt.sqlType)

View file

@ -18,6 +18,7 @@
package org.apache.spark.sql.types
import org.apache.spark.annotation.Stable
import org.apache.spark.unsafe.types.CalendarInterval
/**
* The data type representing calendar time intervals. The calendar time interval is stored
@ -36,6 +37,8 @@ class CalendarIntervalType private() extends DataType {
override def simpleString: String = "interval"
val ordering: Ordering[CalendarInterval] = Ordering[CalendarInterval]
private[spark] override def asNullable: CalendarIntervalType = this
}

View file

@ -0,0 +1,43 @@
-- test for intervals
-- greater than, and greater than or equal
select interval '1 day' > interval '23 hour';
select interval '-1 day' >= interval '-23 hour';
select interval '-1 day' > null;
select null > interval '-1 day';
-- less than, and less than or equal
select interval '1 minutes' < interval '1 hour';
select interval '-1 day' <= interval '-23 hour';
-- equal
select interval '1 year' = interval '360 days';
select interval '1 year 2 month' = interval '420 days';
select interval '1 year' = interval '365 days';
select interval '1 month' = interval '30 days';
select interval '1 minutes' = interval '1 hour';
select interval '1 minutes' = null;
select null = interval '-1 day';
-- null safe equal
select interval '1 minutes' <=> null;
select null <=> interval '1 minutes';
-- complex interval representation
select INTERVAL '9 years 1 months -1 weeks -4 days -10 hours -46 minutes' > interval '1 minutes';
-- ordering
select cast(v as interval) i from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v) order by i;
-- days component larger than one month (30 days)
select interval '1 month 120 days' > interval '2 month';
select interval '1 month 30 days' = interval '2 month';
-- time (microseconds) component larger than one day
select interval '1 month 29 days 40 hours' > interval '2 month';
-- max
select max(cast(v as interval)) from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v);
-- min
select min(cast(v as interval)) from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v);

View file

@ -0,0 +1,180 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 22
-- !query 0
select interval '1 day' > interval '23 hour'
-- !query 0 schema
struct<(1 days > 23 hours):boolean>
-- !query 0 output
true
-- !query 1
select interval '-1 day' >= interval '-23 hour'
-- !query 1 schema
struct<(-1 days >= -23 hours):boolean>
-- !query 1 output
false
-- !query 2
select interval '-1 day' > null
-- !query 2 schema
struct<(-1 days > CAST(NULL AS INTERVAL)):boolean>
-- !query 2 output
NULL
-- !query 3
select null > interval '-1 day'
-- !query 3 schema
struct<(CAST(NULL AS INTERVAL) > -1 days):boolean>
-- !query 3 output
NULL
-- !query 4
select interval '1 minutes' < interval '1 hour'
-- !query 4 schema
struct<(1 minutes < 1 hours):boolean>
-- !query 4 output
true
-- !query 5
select interval '-1 day' <= interval '-23 hour'
-- !query 5 schema
struct<(-1 days <= -23 hours):boolean>
-- !query 5 output
true
-- !query 6
select interval '1 year' = interval '360 days'
-- !query 6 schema
struct<(1 years = 360 days):boolean>
-- !query 6 output
true
-- !query 7
select interval '1 year 2 month' = interval '420 days'
-- !query 7 schema
struct<(1 years 2 months = 420 days):boolean>
-- !query 7 output
true
-- !query 8
select interval '1 year' = interval '365 days'
-- !query 8 schema
struct<(1 years = 365 days):boolean>
-- !query 8 output
false
-- !query 9
select interval '1 month' = interval '30 days'
-- !query 9 schema
struct<(1 months = 30 days):boolean>
-- !query 9 output
true
-- !query 10
select interval '1 minutes' = interval '1 hour'
-- !query 10 schema
struct<(1 minutes = 1 hours):boolean>
-- !query 10 output
false
-- !query 11
select interval '1 minutes' = null
-- !query 11 schema
struct<(1 minutes = CAST(NULL AS INTERVAL)):boolean>
-- !query 11 output
NULL
-- !query 12
select null = interval '-1 day'
-- !query 12 schema
struct<(CAST(NULL AS INTERVAL) = -1 days):boolean>
-- !query 12 output
NULL
-- !query 13
select interval '1 minutes' <=> null
-- !query 13 schema
struct<(1 minutes <=> CAST(NULL AS INTERVAL)):boolean>
-- !query 13 output
false
-- !query 14
select null <=> interval '1 minutes'
-- !query 14 schema
struct<(CAST(NULL AS INTERVAL) <=> 1 minutes):boolean>
-- !query 14 output
false
-- !query 15
select INTERVAL '9 years 1 months -1 weeks -4 days -10 hours -46 minutes' > interval '1 minutes'
-- !query 15 schema
struct<(9 years 1 months -11 days -10 hours -46 minutes > 1 minutes):boolean>
-- !query 15 output
true
-- !query 16
select cast(v as interval) i from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v) order by i
-- !query 16 schema
struct<i:interval>
-- !query 16 output
1 seconds
3 seconds
4 seconds
-- !query 17
select interval '1 month 120 days' > interval '2 month'
-- !query 17 schema
struct<(1 months 120 days > 2 months):boolean>
-- !query 17 output
true
-- !query 18
select interval '1 month 30 days' = interval '2 month'
-- !query 18 schema
struct<(1 months 30 days = 2 months):boolean>
-- !query 18 output
true
-- !query 19
select interval '1 month 29 days 40 hours' > interval '2 month'
-- !query 19 schema
struct<(1 months 29 days 40 hours > 2 months):boolean>
-- !query 19 output
true
-- !query 20
select max(cast(v as interval)) from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v)
-- !query 20 schema
struct<max(CAST(v AS INTERVAL)):interval>
-- !query 20 output
4 seconds
-- !query 21
select min(cast(v as interval)) from VALUES ('1 seconds'), ('4 seconds'), ('3 seconds') t(v)
-- !query 21 schema
struct<min(CAST(v AS INTERVAL)):interval>
-- !query 21 output
1 seconds