[SPARK-16904][SQL] Removal of Hive Built-in Hash Functions and TestHiveFunctionRegistry

### What changes were proposed in this pull request?

Since Spark 2.0, the Hive built-in `hash` function has not been used in Spark. The public interface does not allow users to unregister Spark's built-in functions, so users can never reach Hive's built-in `hash` function.
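
For illustration, `DESCRIBE FUNCTION` reports which class backs a built-in function. A minimal sketch, assuming a local `SparkSession` (the app name and master below are illustrative):

```scala
import org.apache.spark.sql.SparkSession

// Sketch: check which implementation backs the built-in `hash` function.
val spark = SparkSession.builder()
  .appName("hash-resolution-check") // hypothetical app name
  .master("local[1]")
  .getOrCreate()

// Expected to name org.apache.spark.sql.catalyst.expressions.Murmur3Hash,
// i.e. Spark's native hash, not Hive's GenericUDFHash.
spark.sql("DESCRIBE FUNCTION hash").show(truncate = false)
```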

The only exception is `TestHiveFunctionRegistry`, which allows built-in functions to be unregistered so that Hive's `hash` function can be loaded in the test cases. If it is disabled, 10+ test cases fail because their results differ from the Hive golden answer files.
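
The mismatch is inherent: Hive's `hash` follows Java `hashCode` semantics (for an `INT`, the hash is the value itself), while Spark's native `hash` is Murmur3-based, so answers recorded against Hive's function cannot match. A hedged sketch, reusing the `spark` session from the snippet above:

```scala
// Sketch: Spark's Murmur3-based hash diverges from Hive's
// Java-hashCode-style hash, which is why the golden answer files differ.
// Hive would evaluate hash(0) to 0; Spark's native hash does not.
spark.sql("SELECT hash(0) AS spark_hash, hash(0, 1) AS spark_hash_2").show()
```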

This PR removes `hash` from the list of `hiveFunctions` in `HiveSessionCatalog` and also removes `TestHiveFunctionRegistry`. This removal makes it easier to remove `TestHiveSessionState` in the future.

### How was this patch tested?
N/A

Author: gatorsmile <gatorsmile@gmail.com>

Closes #14498 from gatorsmile/removeHash.
gatorsmile authored on 2016-11-07 01:16:37 -08:00; committed by Reynold Xin
parent 9db06c442c
commit 57626a5570
3 changed files with 20 additions and 50 deletions

File: HiveCompatibilitySuite.scala

@@ -57,8 +57,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
     // Enable in-memory partition pruning for testing purposes
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true)
-    // Use Hive hash expression instead of the native one
-    TestHive.sessionState.functionRegistry.unregisterFunction("hash")
     // Ensures that the plans generation use metastore relation and not OrcRelation
     // Was done because SqlBuilder does not work with plans having logical relation
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, false)
@@ -76,7 +74,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
     TestHive.setConf(HiveUtils.CONVERT_METASTORE_ORC, originalConvertMetastoreOrc)
     TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
-    TestHive.sessionState.functionRegistry.restore()
     // For debugging dump some statistics about how much time was spent in various optimizer rules
     logWarning(RuleExecutor.dumpTimeSpent())
@@ -581,7 +578,26 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "auto_join6",
     "auto_join7",
     "auto_join8",
-    "auto_join9"
+    "auto_join9",
+    // These tests are based on the Hive's hash function, which is different from Spark
+    "auto_join19",
+    "auto_join22",
+    "auto_join25",
+    "auto_join26",
+    "auto_join27",
+    "auto_join28",
+    "auto_join30",
+    "auto_join31",
+    "auto_join_nulls",
+    "auto_join_reordering_values",
+    "correlationoptimizer1",
+    "correlationoptimizer2",
+    "correlationoptimizer3",
+    "correlationoptimizer4",
+    "multiMapJoin1",
+    "orc_dictionary_threshold",
+    "udf_hash"
   )
   /**
@@ -601,16 +617,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "annotate_stats_part",
     "annotate_stats_table",
     "annotate_stats_union",
-    "auto_join19",
-    "auto_join22",
-    "auto_join25",
-    "auto_join26",
-    "auto_join27",
-    "auto_join28",
-    "auto_join30",
-    "auto_join31",
-    "auto_join_nulls",
-    "auto_join_reordering_values",
     "binary_constant",
     "binarysortable_1",
     "cast1",
@@ -623,15 +629,11 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "compute_stats_long",
     "compute_stats_string",
     "convert_enum_to_string",
-    "correlationoptimizer1",
     "correlationoptimizer10",
     "correlationoptimizer11",
     "correlationoptimizer13",
     "correlationoptimizer14",
     "correlationoptimizer15",
-    "correlationoptimizer2",
-    "correlationoptimizer3",
-    "correlationoptimizer4",
     "correlationoptimizer6",
     "correlationoptimizer7",
     "correlationoptimizer8",
@@ -871,7 +873,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "merge2",
     "merge4",
     "mergejoins",
-    "multiMapJoin1",
     "multiMapJoin2",
     "multi_insert_gby",
     "multi_insert_gby3",
@@ -893,7 +894,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "nullinput2",
     "nullscript",
     "optional_outer",
-    "orc_dictionary_threshold",
     "order",
     "order2",
     "outer_join_ppr",
@@ -1026,7 +1026,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
     "udf_from_unixtime",
     "udf_greaterthan",
     "udf_greaterthanorequal",
-    "udf_hash",
     "udf_hex",
     "udf_if",
     "udf_index",

File: HiveSessionCatalog.scala

@@ -233,7 +233,6 @@ private[sql] class HiveSessionCatalog(
   // in_file, index, matchpath, ngrams, noop, noopstreaming, noopwithmap,
   // noopwithmapstreaming, parse_url_tuple, reflect2, windowingtablefunction.
   private val hiveFunctions = Seq(
-    "hash",
     "histogram_numeric",
     "percentile"
   )

File: TestHive.scala

@@ -492,24 +492,6 @@ private[hive] class TestHiveQueryExecution(
   }
 }
-private[hive] class TestHiveFunctionRegistry extends SimpleFunctionRegistry {
-  private val removedFunctions =
-    collection.mutable.ArrayBuffer.empty[(String, (ExpressionInfo, FunctionBuilder))]
-  def unregisterFunction(name: String): Unit = synchronized {
-    functionBuilders.remove(name).foreach(f => removedFunctions += name -> f)
-  }
-  def restore(): Unit = synchronized {
-    removedFunctions.foreach {
-      case (name, (info, builder)) => registerFunction(name, info, builder)
-    }
-  }
-}
 private[hive] class TestHiveSessionState(
     sparkSession: TestHiveSparkSession)
   extends HiveSessionState(sparkSession) { self =>
@@ -525,16 +507,6 @@ private[hive] class TestHiveSessionState(
     }
   }
-  override lazy val functionRegistry: TestHiveFunctionRegistry = {
-    // We use TestHiveFunctionRegistry at here to track functions that have been explicitly
-    // unregistered (through TestHiveFunctionRegistry.unregisterFunction method).
-    val fr = new TestHiveFunctionRegistry
-    org.apache.spark.sql.catalyst.analysis.FunctionRegistry.expressions.foreach {
-      case (name, (info, builder)) => fr.registerFunction(name, info, builder)
-    }
-    fr
-  }
   override def executePlan(plan: LogicalPlan): TestHiveQueryExecution = {
     new TestHiveQueryExecution(sparkSession, plan)
   }