[SPARK-21344][SQL] BinaryType comparison does signed byte array comparison

## What changes were proposed in this pull request?

This PR fixes a wrong comparison for `BinaryType`. This PR enables unsigned comparison and unsigned prefix generation for an array for `BinaryType`. Previous implementations uses signed operations.

## How was this patch tested?

Added a test suite in `OrderingSuite`.

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #18571 from kiszk/SPARK-21344.
This commit is contained in:
Kazuaki Ishizaki 2017-07-14 20:16:04 -07:00 committed by gatorsmile
parent 2d968a07d2
commit ac5d5d7959
6 changed files with 60 additions and 3 deletions

View file

@ -44,7 +44,7 @@ public final class ByteArray {
final int minLen = Math.min(bytes.length, 8);
long p = 0;
for (int i = 0; i < minLen; ++i) {
p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i))
p |= ((long) Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 0xff)
<< (56 - 8 * i);
}
return p;

View file

@ -63,7 +63,9 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
def compareBinary(x: Array[Byte], y: Array[Byte]): Int = {
for (i <- 0 until x.length; if i < y.length) {
val res = x(i).compare(y(i))
val v1 = x(i) & 0xff
val v2 = y(i) & 0xff
val res = v1 - v2
if (res != 0) return res
}
x.length - y.length
@ -80,6 +82,19 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
(prefixComparisonResult > 0 && compareBinary(x, y) > 0))
}
val binaryRegressionTests = Seq(
(Array[Byte](1), Array[Byte](-1)),
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)),
(Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1)),
(Array[Byte](1), Array[Byte](1, 1, 1, 1)),
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1)),
(Array[Byte](-1), Array[Byte](-1, -1, -1, -1)),
(Array[Byte](-1, -1, -1, -1, -1), Array[Byte](-1, -1, -1, -1, -1, -1, -1, -1, -1))
)
binaryRegressionTests.foreach { case (b1, b2) =>
testPrefixComparison(b1, b2)
}
// scalastyle:off
val regressionTests = Table(
("s1", "s2"),

View file

@ -70,7 +70,9 @@ object TypeUtils {
def compareBinary(x: Array[Byte], y: Array[Byte]): Int = {
for (i <- 0 until x.length; if i < y.length) {
val res = x(i).compareTo(y(i))
val v1 = x(i) & 0xff
val v2 = y(i) & 0xff
val res = v1 - v2
if (res != 0) return res
}
x.length - y.length

View file

@ -137,4 +137,23 @@ class OrderingSuite extends SparkFunSuite with ExpressionEvalHelper {
// verify that we can support up to 5000 ordering comparisons, which should be sufficient
GenerateOrdering.generate(Array.fill(5000)(sortOrder))
}
test("SPARK-21344: BinaryType comparison does signed byte array comparison") {
val data = Seq(
(Array[Byte](1), Array[Byte](-1)),
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)),
(Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1))
)
data.foreach { case (b1, b2) =>
val rowOrdering = InterpretedOrdering.forSchema(Seq(BinaryType))
val genOrdering = GenerateOrdering.generate(
BoundReference(0, BinaryType, nullable = true).asc :: Nil)
val rowType = StructType(StructField("b", BinaryType, nullable = true) :: Nil)
val toCatalyst = CatalystTypeConverters.createToCatalystConverter(rowType)
val rowB1 = toCatalyst(Row(b1)).asInstanceOf[InternalRow]
val rowB2 = toCatalyst(Row(b2)).asInstanceOf[InternalRow]
assert(rowOrdering.compare(rowB1, rowB2) < 0)
assert(genOrdering.compare(rowB1, rowB2) < 0)
}
}
}

View file

@ -0,0 +1,3 @@
-- binary type
select x'00' < x'0f';
select x'00' < x'ff';

View file

@ -0,0 +1,18 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 2
-- !query 0
select x'00' < x'0f'
-- !query 0 schema
struct<(X'00' < X'0F'):boolean>
-- !query 0 output
true
-- !query 1
select x'00' < x'ff'
-- !query 1 schema
struct<(X'00' < X'FF'):boolean>
-- !query 1 output
true