[SPARK-23947][SQL] Add hashUTF8String convenience method to hasher classes

## What changes were proposed in this pull request?

Add `hashUTF8String()` to the hasher classes to allow Spark SQL codegen to generate cleaner code for hashing `UTF8String`s. No change in behavior otherwise.
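For a Murmur3-hashed string column, the net effect is that the two-step `getMemoryBlock()` + `hashUnsafeBytesBlock()` call collapses into a single `hashUTF8String()` call. A minimal sketch of that equivalence, assuming an arbitrary input string and seed (`"spark"` and `42` are example values only):

```java
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.types.UTF8String;

public class HashUTF8StringSketch {
  public static void main(String[] args) {
    UTF8String input = UTF8String.fromString("spark");
    int seed = 42; // example seed, not tied to any particular query

    // Shape of the call that generated code used before this patch:
    int before = Murmur3_x86_32.hashUnsafeBytesBlock(input.getMemoryBlock(), seed);
    // Shape of the call that generated code uses after this patch:
    int after = Murmur3_x86_32.hashUTF8String(input, seed);

    // hashUTF8String simply delegates to hashUnsafeBytesBlock, so both agree.
    System.out.println(before == after); // true
  }
}
```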

Although SPARK-10399 already reduced the size of the generated code for hashing `UTF8String`, extracting a dedicated method in the hasher classes still keeps the generated code cleaner.
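All three hashers gain the same-shaped helper. A rough usage sketch, with package and class names taken from the diff below and arbitrary literal inputs:

```java
import org.apache.spark.sql.catalyst.expressions.HiveHasher;
import org.apache.spark.sql.catalyst.expressions.XXH64;
import org.apache.spark.unsafe.hash.Murmur3_x86_32;
import org.apache.spark.unsafe.types.UTF8String;

public class HasherConvenienceSketch {
  public static void main(String[] args) {
    UTF8String s = UTF8String.fromString("hello");

    int murmur = Murmur3_x86_32.hashUTF8String(s, 42); // 32-bit Murmur3, int seed
    long xx = XXH64.hashUTF8String(s, 42L);            // 64-bit xxHash64, long seed
    int hive = HiveHasher.hashUTF8String(s);           // Hive-compatible hash, no seed

    System.out.printf("murmur3=%d xxh64=%d hive=%d%n", murmur, xx, hive);
  }
}
```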

## How was this patch tested?

Existing tests.

Author: Kris Mok <kris.mok@databricks.com>

Closes #21016 from rednaxelafx/hashutf8.
Kris Mok authored 2018-04-09 21:07:28 -07:00, committed by gatorsmile
commit f94f3624ea (parent 61b724724c)
4 changed files with 18 additions and 5 deletions

HiveHasher.java

@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 /**
  * Simulates Hive's hashing function from Hive v1.2.1
@@ -51,4 +52,8 @@ public class HiveHasher {
   public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes));
   }
+
+  public static int hashUTF8String(UTF8String str) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock());
+  }
 }

Murmur3_x86_32.java

@@ -20,6 +20,7 @@ package org.apache.spark.unsafe.hash;
 import com.google.common.primitives.Ints;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 /**
  * 32-bit Murmur3 hasher. This is based on Guava's Murmur3_32HashFunction.
@@ -82,6 +83,10 @@ public final class Murmur3_x86_32 {
     return fmix(h1, lengthInBytes);
   }
 
+  public static int hashUTF8String(UTF8String str, int seed) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+  }
+
   public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, lengthInBytes), seed);
   }
@@ -91,7 +96,7 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytes2Block(MemoryBlock base, int seed) {
-    // This is compatible with original and another implementations.
+    // This is compatible with original and other implementations.
     // Use this method for new components after Spark 2.3.
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";

XXH64.java

@@ -17,6 +17,7 @@
 package org.apache.spark.sql.catalyst.expressions;
 
 import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.unsafe.types.UTF8String;
 
 // scalastyle: off
 /**
@@ -107,6 +108,10 @@ public final class XXH64 {
     return fmix(hash);
   }
 
+  public static long hashUTF8String(UTF8String str, long seed) {
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+  }
+
   public static long hashUnsafeBytes(Object base, long offset, int length, long seed) {
     return hashUnsafeBytesBlock(MemoryBlock.allocateFromObject(base, offset, length), seed);
   }

hash.scala

@@ -361,8 +361,7 @@ abstract class HashExpression[E] extends Expression {
   }
 
   protected def genHashString(input: String, result: String): String = {
-    val mb = s"$input.getMemoryBlock()"
-    s"$result = $hasherClassName.hashUnsafeBytesBlock($mb, $result);"
+    s"$result = $hasherClassName.hashUTF8String($input, $result);"
   }
 
   protected def genHashForMap(
@@ -725,8 +724,7 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] {
   """
 
   override protected def genHashString(input: String, result: String): String = {
-    val mb = s"$input.getMemoryBlock()"
-    s"$result = $hasherClassName.hashUnsafeBytesBlock($mb);"
+    s"$result = $hasherClassName.hashUTF8String($input);"
   }
 
   override protected def genHashForArray(