[SPARK-36501][ML] Fix random col names in LSHModel.approxSimilarityJoin
### What changes were proposed in this pull request?

`Random.nextString()` can return characters that are not valid in identifiers or that are likely to trigger bugs, e.g. non-printing characters, ".", "`". Instead, use a utility that always generates valid alphanumeric identifiers.

### Why are the changes needed?

To deflake BucketedRandomProjectionLSHSuite and to avoid similar failures that users could encounter.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Ran org.apache.spark.ml.feature.BucketedRandomProjectionLSHSuite.

Closes #33730 from timarmstrong/flaky-lsb.

Authored-by: Tim Armstrong <tim.armstrong@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
This commit is contained in:
parent
b8e2186fe1
commit
886dbe01cd
|
@ -17,8 +17,6 @@
|
|||
|
||||
package org.apache.spark.ml.feature
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
import org.apache.spark.ml.{Estimator, Model}
|
||||
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
|
||||
import org.apache.spark.ml.param.{IntParam, ParamValidators}
|
||||
|
@ -280,7 +278,7 @@ private[ml] abstract class LSHModel[T <: LSHModel[T]]
|
|||
val explodedB = if (datasetA != datasetB) {
|
||||
processDataset(datasetB, rightColName, explodeCols)
|
||||
} else {
|
||||
val recreatedB = recreateCol(datasetB, $(inputCol), s"${$(inputCol)}#${Random.nextString(5)}")
|
||||
val recreatedB = recreateCol(datasetB, $(inputCol), Identifiable.randomUID(inputCol.name))
|
||||
processDataset(recreatedB, rightColName, explodeCols)
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue