[SPARK-21344][SQL] BinaryType comparison does signed byte array comparison
## What changes were proposed in this pull request? This PR fixes a wrong comparison for `BinaryType`. This PR enables unsigned comparison and unsigned prefix generation for an array for `BinaryType`. Previous implementations uses signed operations. ## How was this patch tested? Added a test suite in `OrderingSuite`. Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com> Closes #18571 from kiszk/SPARK-21344.
This commit is contained in:
parent
2d968a07d2
commit
ac5d5d7959
|
@ -44,7 +44,7 @@ public final class ByteArray {
|
|||
final int minLen = Math.min(bytes.length, 8);
|
||||
long p = 0;
|
||||
for (int i = 0; i < minLen; ++i) {
|
||||
p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i))
|
||||
p |= ((long) Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 0xff)
|
||||
<< (56 - 8 * i);
|
||||
}
|
||||
return p;
|
||||
|
|
|
@ -63,7 +63,9 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
|
|||
|
||||
def compareBinary(x: Array[Byte], y: Array[Byte]): Int = {
|
||||
for (i <- 0 until x.length; if i < y.length) {
|
||||
val res = x(i).compare(y(i))
|
||||
val v1 = x(i) & 0xff
|
||||
val v2 = y(i) & 0xff
|
||||
val res = v1 - v2
|
||||
if (res != 0) return res
|
||||
}
|
||||
x.length - y.length
|
||||
|
@ -80,6 +82,19 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks {
|
|||
(prefixComparisonResult > 0 && compareBinary(x, y) > 0))
|
||||
}
|
||||
|
||||
val binaryRegressionTests = Seq(
|
||||
(Array[Byte](1), Array[Byte](-1)),
|
||||
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)),
|
||||
(Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1)),
|
||||
(Array[Byte](1), Array[Byte](1, 1, 1, 1)),
|
||||
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1)),
|
||||
(Array[Byte](-1), Array[Byte](-1, -1, -1, -1)),
|
||||
(Array[Byte](-1, -1, -1, -1, -1), Array[Byte](-1, -1, -1, -1, -1, -1, -1, -1, -1))
|
||||
)
|
||||
binaryRegressionTests.foreach { case (b1, b2) =>
|
||||
testPrefixComparison(b1, b2)
|
||||
}
|
||||
|
||||
// scalastyle:off
|
||||
val regressionTests = Table(
|
||||
("s1", "s2"),
|
||||
|
|
|
@ -70,7 +70,9 @@ object TypeUtils {
|
|||
|
||||
def compareBinary(x: Array[Byte], y: Array[Byte]): Int = {
|
||||
for (i <- 0 until x.length; if i < y.length) {
|
||||
val res = x(i).compareTo(y(i))
|
||||
val v1 = x(i) & 0xff
|
||||
val v2 = y(i) & 0xff
|
||||
val res = v1 - v2
|
||||
if (res != 0) return res
|
||||
}
|
||||
x.length - y.length
|
||||
|
|
|
@ -137,4 +137,23 @@ class OrderingSuite extends SparkFunSuite with ExpressionEvalHelper {
|
|||
// verify that we can support up to 5000 ordering comparisons, which should be sufficient
|
||||
GenerateOrdering.generate(Array.fill(5000)(sortOrder))
|
||||
}
|
||||
|
||||
test("SPARK-21344: BinaryType comparison does signed byte array comparison") {
|
||||
val data = Seq(
|
||||
(Array[Byte](1), Array[Byte](-1)),
|
||||
(Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)),
|
||||
(Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1))
|
||||
)
|
||||
data.foreach { case (b1, b2) =>
|
||||
val rowOrdering = InterpretedOrdering.forSchema(Seq(BinaryType))
|
||||
val genOrdering = GenerateOrdering.generate(
|
||||
BoundReference(0, BinaryType, nullable = true).asc :: Nil)
|
||||
val rowType = StructType(StructField("b", BinaryType, nullable = true) :: Nil)
|
||||
val toCatalyst = CatalystTypeConverters.createToCatalystConverter(rowType)
|
||||
val rowB1 = toCatalyst(Row(b1)).asInstanceOf[InternalRow]
|
||||
val rowB2 = toCatalyst(Row(b2)).asInstanceOf[InternalRow]
|
||||
assert(rowOrdering.compare(rowB1, rowB2) < 0)
|
||||
assert(genOrdering.compare(rowB1, rowB2) < 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
-- binary type
|
||||
select x'00' < x'0f';
|
||||
select x'00' < x'ff';
|
|
@ -0,0 +1,18 @@
|
|||
-- Automatically generated by SQLQueryTestSuite
|
||||
-- Number of queries: 2
|
||||
|
||||
|
||||
-- !query 0
|
||||
select x'00' < x'0f'
|
||||
-- !query 0 schema
|
||||
struct<(X'00' < X'0F'):boolean>
|
||||
-- !query 0 output
|
||||
true
|
||||
|
||||
|
||||
-- !query 1
|
||||
select x'00' < x'ff'
|
||||
-- !query 1 schema
|
||||
struct<(X'00' < X'FF'):boolean>
|
||||
-- !query 1 output
|
||||
true
|
Loading…
Reference in a new issue