From e679bc3c1cd418ef0025d2ecbc547c9660cac433 Mon Sep 17 00:00:00 2001
From: Yin Huai
Date: Fri, 5 Aug 2016 15:52:02 -0700
Subject: [PATCH] [SPARK-16901] Hive settings in hive-site.xml may be
 overridden by Hive's default values

## What changes were proposed in this pull request?

When we create the HiveConf for the metastore client, we use a Hadoop Conf as the base, which may contain Hive settings from hive-site.xml (https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala#L49). However, HiveConf's initialize function basically ignores the base Hadoop Conf and always uses its default values (i.e. settings with non-null default values) as the base (https://github.com/apache/hive/blob/release-1.2.1/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java#L2687). So, even if a user puts javax.jdo.option.ConnectionURL in hive-site.xml, it is not used and Hive falls back to its default, which is jdbc:derby:;databaseName=metastore_db;create=true.

This issue only shows up when `spark.sql.hive.metastore.jars` is not set to builtin.

## How was this patch tested?

New test in HiveSparkSubmitSuite.

Author: Yin Huai

Closes #14497 from yhuai/SPARK-16901.
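For readers who want the gist before the diff, here is a minimal standalone sketch of the workaround this patch applies (illustrative only; `HiveConfSketch` and `buildHiveConf` are made-up names, and the real change lives in HiveClientImpl below): instead of handing the base Hadoop Conf to the HiveConf constructor, which lets HiveConf's initialize reset those entries to Hive's defaults, start from a bare HiveConf and copy every entry over explicitly.

```scala
import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.session.SessionState

object HiveConfSketch {
  // Illustrative helper, not part of the patch: build a HiveConf that keeps
  // the settings (e.g. javax.jdo.option.ConnectionURL from hive-site.xml)
  // already loaded into hadoopConf.
  def buildHiveConf(hadoopConf: Configuration): HiveConf = {
    // new HiveConf(hadoopConf, ...) would NOT work here: HiveConf.initialize
    // ignores the base conf and starts from Hive's non-null defaults.
    val hiveConf = new HiveConf(classOf[SessionState])
    // Configuration iterates over java.util.Map.Entry[String, String],
    // so copy each entry into the fresh HiveConf by hand.
    hadoopConf.iterator().asScala.foreach { entry =>
      hiveConf.set(entry.getKey, entry.getValue)
    }
    hiveConf
  }
}
```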
---
 .../sql/hive/client/HiveClientImpl.scala      | 24 +++++-
 .../spark/sql/hive/HiveSparkSubmitSuite.scala | 80 +++++++++++++++++++
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index ef69ac76f2..3bf4ed5ab4 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -141,14 +141,32 @@ private[hive] class HiveClientImpl(
       // so we should keep `conf` and reuse the existing instance of `CliSessionState`.
       originalState
     } else {
-      val hiveConf = new HiveConf(hadoopConf, classOf[SessionState])
+      val hiveConf = new HiveConf(classOf[SessionState])
+      // 1: we set all confs in the hadoopConf to this hiveConf.
+      // This hadoopConf contains user settings in Hadoop's core-site.xml file
+      // and Hive's hive-site.xml file. Note, we load hive-site.xml file manually in
+      // SharedState and put settings in this hadoopConf instead of relying on HiveConf
+      // to load user settings. Otherwise, HiveConf's initialize method will override
+      // settings in the hadoopConf. This issue only shows up when spark.sql.hive.metastore.jars
+      // is not set to builtin. When spark.sql.hive.metastore.jars is builtin, the classpath
+      // has hive-site.xml. So, HiveConf will use that to override its default values.
+      hadoopConf.iterator().asScala.foreach { entry =>
+        val key = entry.getKey
+        val value = entry.getValue
+        if (key.toLowerCase.contains("password")) {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=xxx")
+        } else {
+          logDebug(s"Applying Hadoop and Hive config to Hive Conf: $key=$value")
+        }
+        hiveConf.set(key, value)
+      }
       // HiveConf is a Hadoop Configuration, which has a field of classLoader and
       // the initial value will be the current thread's context class loader
       // (i.e. initClassLoader at here).
       // We call initialConf.setClassLoader(initClassLoader) at here to make
       // this action explicit.
       hiveConf.setClassLoader(initClassLoader)
-      // First, we set all spark confs to this hiveConf.
+      // 2: we set all spark confs to this hiveConf.
       sparkConf.getAll.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying Spark config to Hive Conf: $k=xxx")
@@ -157,7 +175,7 @@ private[hive] class HiveClientImpl(
         }
         hiveConf.set(k, v)
       }
-      // Second, we set all entries in config to this hiveConf.
+      // 3: we set all entries in config to this hiveConf.
       extraConfig.foreach { case (k, v) =>
         if (k.toLowerCase.contains("password")) {
           logDebug(s"Applying extra config to HiveConf: $k=xxx")
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
index 9bca720a94..dd8fec0c15 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala
@@ -253,6 +253,47 @@ class HiveSparkSubmitSuite
     runSparkSubmit(args)
   }
 
+  test("SPARK-16901: set javax.jdo.option.ConnectionURL") {
+    // In this test, we set javax.jdo.option.ConnectionURL and set the metastore version to
+    // 0.13. This test will make sure that javax.jdo.option.ConnectionURL will not be
+    // overridden by Hive's default settings when we create a HiveConf object inside
+    // HiveClientImpl. Please see SPARK-16901 for more details.
+
+    val metastoreLocation = Utils.createTempDir()
+    metastoreLocation.delete()
+    val metastoreURL =
+      s"jdbc:derby:memory:;databaseName=${metastoreLocation.getAbsolutePath};create=true"
+    val hiveSiteXmlContent =
+      s"""
+         |<configuration>
+         |  <property>
+         |    <name>javax.jdo.option.ConnectionURL</name>
+         |    <value>$metastoreURL</value>
+         |  </property>
+         |</configuration>
+       """.stripMargin
+
+    // Write a hive-site.xml containing the javax.jdo.option.ConnectionURL setting above.
+    val hiveSiteDir = Utils.createTempDir()
+    val file = new File(hiveSiteDir.getCanonicalPath, "hive-site.xml")
+    val bw = new BufferedWriter(new FileWriter(file))
+    bw.write(hiveSiteXmlContent)
+    bw.close()
+
+    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
+    val args = Seq(
+      "--class", SetMetastoreURLTest.getClass.getName.stripSuffix("$"),
+      "--name", "SetMetastoreURLTest",
+      "--master", "local[1]",
+      "--conf", "spark.ui.enabled=false",
+      "--conf", "spark.master.rest.enabled=false",
+      "--conf", s"spark.sql.test.expectedMetastoreURL=$metastoreURL",
+      "--conf", s"spark.driver.extraClassPath=${hiveSiteDir.getCanonicalPath}",
+      "--driver-java-options", "-Dderby.system.durability=test",
+      unusedJar.toString)
+    runSparkSubmit(args)
+  }
+
   // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
   // This is copied from org.apache.spark.deploy.SparkSubmitSuite
   private def runSparkSubmit(args: Seq[String]): Unit = {
@@ -313,6 +354,45 @@ class HiveSparkSubmitSuite
   }
 }
 
+object SetMetastoreURLTest extends Logging {
+  def main(args: Array[String]): Unit = {
+    Utils.configTestLog4j("INFO")
+
+    val sparkConf = new SparkConf(loadDefaults = true)
+    val builder = SparkSession.builder()
+      .config(sparkConf)
+      .config("spark.ui.enabled", "false")
+      .config("spark.sql.hive.metastore.version", "0.13.1")
+      // The issue described in SPARK-16901 only appears when
+      // spark.sql.hive.metastore.jars is not set to builtin.
+ .config("spark.sql.hive.metastore.jars", "maven") + .enableHiveSupport() + + val spark = builder.getOrCreate() + val expectedMetastoreURL = + spark.conf.get("spark.sql.test.expectedMetastoreURL") + logInfo(s"spark.sql.test.expectedMetastoreURL is $expectedMetastoreURL") + + if (expectedMetastoreURL == null) { + throw new Exception( + s"spark.sql.test.expectedMetastoreURL should be set.") + } + + // HiveSharedState is used when Hive support is enabled. + val actualMetastoreURL = + spark.sharedState.asInstanceOf[HiveSharedState] + .metadataHive + .getConf("javax.jdo.option.ConnectionURL", "this_is_a_wrong_URL") + logInfo(s"javax.jdo.option.ConnectionURL is $actualMetastoreURL") + + if (actualMetastoreURL != expectedMetastoreURL) { + throw new Exception( + s"Expected value of javax.jdo.option.ConnectionURL is $expectedMetastoreURL. But, " + + s"the actual value is $actualMetastoreURL") + } + } +} + object SetWarehouseLocationTest extends Logging { def main(args: Array[String]): Unit = { Utils.configTestLog4j("INFO")