[SPARK-30847][SQL] Take productPrefix into account in MurmurHash3.productHash
### What changes were proposed in this pull request? This PR ports Scala's bugfix https://github.com/scala/scala/pull/7693 (Scala 2.13) to address the https://github.com/scala/bug/issues/10495 issue. In short, it is possible for different product instances that have the same children to produce the same hash. See: ```scala scala> spark.range(1).selectExpr("id - 1").queryExecution.analyzed.semanticHash() res0: Int = -565572825 scala> spark.range(1).selectExpr("id + 1").queryExecution.analyzed.semanticHash() res1: Int = -565572825 ``` ### Why are the changes needed? This was found during the review of https://github.com/apache/spark/pull/27565. We should produce different hashes for different objects. ### Does this PR introduce any user-facing change? No user-facing change has been identified, though there may be a performance-related effect. ### How was this patch tested? Manually tested, and a unit test was added. Closes #27601 from HyukjinKwon/SPARK-30847. Authored-by: HyukjinKwon <gurwls223@apache.org> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
5866bc77d7
commit
9618806f44
|
@ -114,7 +114,30 @@ abstract class TreeNode[BaseType <: TreeNode[BaseType]] extends Product {
|
|||
|
||||
  // Cached Set view of this node's children for O(1) membership tests.
  // Lazily built so construction stays cheap; NOTE(review): assumes `children`
  // is stable for the lifetime of this (immutable) node — confirm.
  lazy val containsChild: Set[TreeNode[_]] = children.toSet
|
||||
|
||||
private lazy val _hashCode: Int = scala.util.hashing.MurmurHash3.productHash(this)
|
||||
// Copied from Scala 2.13.1
|
||||
// github.com/scala/scala/blob/v2.13.1/src/library/scala/util/hashing/MurmurHash3.scala#L56-L73
|
||||
// to prevent the issue https://github.com/scala/bug/issues/10495
|
||||
// TODO(SPARK-30848): Remove this once we drop Scala 2.12.
|
||||
private final def productHash(x: Product, seed: Int, ignorePrefix: Boolean = false): Int = {
|
||||
val arr = x.productArity
|
||||
// Case objects have the hashCode inlined directly into the
|
||||
// synthetic hashCode method, but this method should still give
|
||||
// a correct result if passed a case object.
|
||||
if (arr == 0) {
|
||||
x.productPrefix.hashCode
|
||||
} else {
|
||||
var h = seed
|
||||
if (!ignorePrefix) h = scala.util.hashing.MurmurHash3.mix(h, x.productPrefix.hashCode)
|
||||
var i = 0
|
||||
while (i < arr) {
|
||||
h = scala.util.hashing.MurmurHash3.mix(h, x.productElement(i).##)
|
||||
i += 1
|
||||
}
|
||||
scala.util.hashing.MurmurHash3.finalizeHash(h, arr)
|
||||
}
|
||||
}
|
||||
|
||||
  // Cache the hash once; computed with the local productHash port so the
  // product prefix (node class name) participates in the result (SPARK-30847).
  private lazy val _hashCode: Int = productHash(this, scala.util.hashing.MurmurHash3.productSeed)

  override def hashCode(): Int = _hashCode
|
||||
|
||||
/**
|
||||
|
|
|
@ -79,4 +79,11 @@ class CanonicalizeSuite extends SparkFunSuite {
|
|||
0, Some("b2"))
|
||||
assert(fieldB1.semanticEquals(fieldB2))
|
||||
}
|
||||
|
||||
test("SPARK-30847: Take productPrefix into account in MurmurHash3.productHash") {
|
||||
val range = Range(1, 1, 1, 1)
|
||||
val addExpr = Add(range.output.head, Literal(1))
|
||||
val subExpr = Subtract(range.output.head, Literal(1))
|
||||
assert(addExpr.canonicalized.hashCode() != subExpr.canonicalized.hashCode())
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue